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World Population Analysis 


In [65]: import numpy as np 
import pandas as pd 
import seaborn as sns 
from plotly.subplots import make_subplots 
import matplotlib.pyplot as plt 
from itables import init_notebook_mode 
from itables import show 
import plotly.express as px 
from plotly.offline import iplot, init_notebook_mode 


from plotly.offline import download_plotlyjs,init_notebook_mode, pl 
init_notebook_mode(connected=True) 
cf.go offline() 


import warnings 
warnings.filterwarnings('ignore') 


Importing Dataset 


In [45]: #loading dataset in pandas 
data = pd.read csv('world population.csv') 


In [46]: #check first five rows 
data.head() 


Out [46]: 


2022 2020 2015 


Rank CCAS Country/Territory Capital Continent Population Population Population | 


0 36  AFG Afghanistan Kabul Asia 41128771 38972230 33753499 


1 138 ALB Albania Tirana Europe 2842321 2866849 2882481 

2 34 DZA Algeria Algiers Africa 44903225 43451666 39543154 
: Pago ; 

3 213 ASM American Samoa Pago Oceania 44273 46189 51368 

4 203 AND Andorra Pe Europe 79824 77700 71746 
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In [47]: #check last five rows 
data.tail() 
Out [47]: 
. l) S 2022 2020 2015 
Rank CCA3 Country/Territory Capital Continent Population Population Population 
229 226 WLF Wallis and Futuna ir Oceania 11572 11655 12182 
230 172 ESH Western Sahara me Africa 575986 556048 491824 
231 46 YEM Yemen Sanaa Asia 33696614 32284046 28516545 
232 63 ZMB Zambia Lusaka Africa — 20017675 18927715 16248230 
233 74 ZWE Zimbabwe Harare Africa 16320537 15669666 14154937 
In [48]: #check shape 
data.shape 
Out [48]: (234, 17) 
In [49]: #check more info 
data. info() 
«class 'pandas.core.frame.DataFrame'> 
RangeIndex: 234 entries, 0 to 233 
Data columns (total 17 columns): 
# Column Non-Null Count Dtype 
0 Rank 234 non-null int64 
1  CCA3 234 non-null object 
2  Country/Territory 234 non-null object 
3 Capital 234 non-null object 
4 Continent 234 non-null object 
5 2022 Population 234 non-null int64 
6 2020 Population 234 non-null int64 
7 2015 Population 234 non-null int64 
8 2010 Population 234 non-null int64 
9 2000 Population 234 non-null int64 
10 31990 Population 234 non-null int64 
11 1980 Population 234 non-null int64 
12 1970 Population 234 non-null int64 
13 Area (km?) 234 non-null int64 
14 Density (per km?) 234 non-null float64 
15 Growth Rate 234 non-null float64 
16 World Population Percentage 234 non-null float64 
dtypes: float64(3), int64(10), object(4) 
memory usage: 31.2+ KB 
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In [50]: 


Out [50] : 


count 
mean 
std 
min 
25% 
50% 
75% 


max 


Rank 


234.000000 
117.500000 
67.694165 
1.000000 
59.250000 
117.500000 
175.750000 


234.000000 


#mathmatic realtion 
data.describe() 


2022 
Population 


2.340000e+02 
3.407441e+07 
1.367664e+08 
5.100000e+02 
4.197385e+05 
5.559944e+06 
2.247650e+07 


1.425887e+09 


2020 
Population 


2.340000e+02 
3.350107e+07 
1.355899e+08 
5.200000e+02 
4.152845e+05 
5.493074e+06 
2.144798e+07 


1.424930e+09 


2015 
Population 


2.340000e+02 
3.172996e+07 
1.304050e+08 
5.640000e+02 
4.046760e+05 
5.307400e+06 
1.973085e+07 


1.393715e+09 
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2010 
Population 


2.340000e+02 
2.984524e+07 
1.242185e+08 
5.960000e+02 
3.931490e+05 
4.9427 70e+06 
1.915957e+07 


1.348191e+09 
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2000 
Population 


2.340000e+02 
2.626947e+07 
1.116982e+08 
6.510000e+02 
3.272420e+05 
4.292907e+06 
1.576230e+07 


1.264099e+09 
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#check corr realtion 
data.corr() 


Rank 


2022 
Population 


2020 
Population 


2015 
Population 


2010 
Population 


2000 
Population 


1990 
Population 


1980 
Population 


1970 
Population 


Area (km?) 


Density 
(per km?) 


Growth 
Rate 


World 
Population 
Percentage 


Rank 


1.000000 


-0.358361 


-0.355854 


-0.351222 


-0.347461 


-0.341057 


-0.336152 


-0.335246 


-0.335379 


-0.383774 


0.129436 


-0.224561 


-0.358464 


2022 
Population 


-0.358361 


1.000000 


0.999946 


0.999490 


0.998629 


0.994605 


0.987228 


0.980285 


0.973162 


0.453411 


-0.027618 


-0.020863 


0.999999 


2020 
Population 


-0.355854 


0.999946 


1.000000 


0.999763 


0.999105 


0.995583 


0.988724 


0.982121 


0.975254 


0.454993 


-0.027358 


-0.025116 


0.999944 


2015 
Population 


-0.351222 


0.999490 


0.999763 


1.000000 


0.999783 


0.997340 


0.991594 


0.985724 


0.979414 


0.458240 


-0.026857 


-0.032154 


0.999487 
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2010 
Population 


-0.347461 


0.998629 


0.999105 


0.999783 


1.000000 


0.998593 


0.993929 


0.988786 


0.983042 


0.461936 


-0.026505 


-0.037983 


0.998626 
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2000 
Population 


-0.341057 


0.994605 


0.995583 


0.997340 


0.998593 


1.000000 


0.998336 


0.995160 


0.990956 


0.473933 


-0.026139 


-0.050515 


0.994598 


19 
Populati: 


-0.3361 


0.9872 


0.9887 


0.9915 


0.9939 


0.9983 


1.0000 


0.9990 


0.9966 


0.4867 


-0.0262 


-0.0623 


0.9872 
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In [52]: #check missing values 
data. isnull().sum() 


Out [52]: Rank 
CCA3 
Country/Territory 
Capital 
Continent 
2022 Population 
2020 Population 
2015 Population 
2010 Population 
2000 Population 
1990 Population 
1980 Population 
1970 Population 
Area (km?) 
Density (per km?) 
Growth Rate 
World Population Percentage 
dtype: int64 


GO G GG G G G G G G G G G G G G O 


Data Processing 


In [53]: # renaming 'Country/Territory' to 'Country' 


df. rename(columns={'Country/Territory':'Country'}, inplace = True) 


In [54]: # renaming year columns from "Year Population" to just "Year" 


for col in df.columns: 
if 'Population' and 'Q' in col: 
df = data.rename(columns={col: col.split(' ')[@]}) 


df.head(3) 


Gut [54]; 


2022 2020 2015 


Rank A t it ital tinent 
ank “CCAS: County Termtory . Capital -Continen Population Population Population F 


(0) 36 AFG Afghanistan Kabul Asia 41128771 38972230 33753499 
1 138 ALB Albania Tirana Europe 2842321 2866849 2882481 
2 34 DZA Algeria Algiers Africa 44903225 43451666 39543154 
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In [5]: # let's check to see how many null objects we have in the dataset 


df.isnull().sum() 


Out[5]: Rank 
CCA3 
Country 
Capital 
Continent 
2022 
2020 
2015 
2010 
2000 
1990 
1980 
1970 
Area (km?) 
Density (per km?) 
Growth Rate 
World Population Percentage 
dtype: int64 


GO G G G G G G G G G G G G G OOO 


In [6]: # looking for duplicates 


df.duplicated().sum() 
Out[6]: 0 
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In [7]: # getting acquinted with our dataset 


In [8]: 
Out [8]: 


df.info() 


<class 'pandas.core.frame.DataFrame'> 
RangeIndex: 234 entries, 0 to 233 
Data columns (total 17 columns): 


# Column 


Country 
Capital 
Continent 
2022 

2020 

2015 

2010 

2000 

10 1990 

11 1980 

12 1970 

13 Area (km?) 
14 Density (per km?) 
15 Growth Rate 


(D GO 4 Gn Ln E LUM KA G 


234 


16 World Population Percentage 234 
dtypes: float64(3), int64(10), object(4) 


memory usage: 31.2+ KB 


df.nunique() 


Rank 

CCA3 

Country 

Capital 
Continent 

2022 

2020 

2015 

2010 

2000 

1990 

1980 

1970 

Area (km?) 
Density (per km?) 
Growth Rate 
World Population Percentage 
dtype: int64 


Non-Null Count 


non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
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Dtype 


float64 
float64 
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In [9]: 


In [10]: 


Out [10]: 


in [11]: 


Out (Ti): 
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Column names that are integers (Such as the years) might introduce some confusion. For 
example, when we are referencing the year 2010, one might confuse that when the 
2010th positional index. 


To avoid this ambuigity, let's convert the column names into strings: '1970' to '2010'. 


# converting the column names into strings 


df.columns 


Since we converted the years to string, let's declare a variable that will allow us to easily 


- list(map(str, df.columns)) 


call upon the full range of years 


years = list(map(str, (1970, 1980, 1990, 2000, 2010, 2015, 2020, 20 


years 


['1970', 


# let's view our statistical summary 


'1980', 


'1990', 


'2000', 


'2010', 


'2015', 


df.describe().T.sort values(ascending-0, by="'mean") 


2022 
2020 
2015 
2010 
2000 
1990 
1980 
1970 
Area (km?) 


Density 
(per km?) 


Rank 


Growth 
Rate 


World 
Population 
Percentage 


count 


234.0 


234.0 


234.0 


234.0 


234.0 


234.0 


234.0 


234.0 


234.0 


234.0 


234.0 


234.0 


234.0 


mean 


3.4074416+07 


3.350107e+07 


3.172996e«07 


2.984524e+07 


2.626947e+07 


2.271022e«07 


1.898462e+07 


1.578691e+07 


5.814494e+05 


4.521270e+02 


1.175000e+02 


1.009577e+00 


4.270513e-01 


std 
1.367664e408 
1.355899e+08 
1.304050e+08 
1.242185e408 
1.116982e408 
9.783217e+07 
8.178519e+07 
6.779509e+07 


1.761841 e+06 


2.066122e403 


6.769417e+01 


1.338498e-02 


1.714977e+00 


min 


510.0000 


520.0000 


564.0000 


596.0000 


651.0000 


700.0000 


733.0000 


752.0000 


1.0000 


0.0261 


1.0000 


0.9120 


0.0000 


25% 
419738.500000 
415284.500000 
404676.000000 
3931 49.000000 
327242.000000 
2641 15.750000 
229614.250000 
155997.000000 


2650.000000 


38.417875 


59.250000 


1.001775 


0.010000 


'2020', 


'2022'] 


50% 
5.559944e+06 
5.49307 4e+06 
5.307400e+06 
4.942770e+06 
4.292907e+06 
3.82541 0e+06 
3.141146e+06 
2.604830e+06 


8.119950e+04 
9.534675e+01 
1.175000e+02 


1.007900e+00 


7.000000e-02 


2. 
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In [12]: # creating dataframe for 'Continent' 


continent_df = df.groupby(by='Continent').sum() 
continent_df.head(3) 


Out [12]: 
Rank 2022 2020 2015 2010 2000 19€ 


Continent 
Africa 5253 1426730932 1360671810 1201102442 1055228072 818946032 6381506 
Asia 3878 4721383274 4663086535 4458250182 4220041327 3735089604 32105635; 


Europe 6225 743147538 745792196 741535608 735613934 726093423 72032075 


In [13]: £ creating dataframe for 'Country' 


country df = df.groupby(by='Country').sum() 
country df.head(3) 


Qut[13]: 
Rank 2022 2020 2015 2010 2000 1990 198 


Country 
Afghanistan 36 41128771 38972230 33753499 28189672 19542982 10694796 1248663 
Albania 138 2842321 2866849 2882481 2913399 3182021 3295066 294165 


Algeria 34 44903225 43451666 39543154 35856344 30774621 25518074 1873937 


Exploratory Data Analysis and Visualization 


World Population EDA 


In [14]: £ current world population 2022 


df['2022'].sum() 
Out[14]: 7973413042 


Current world population is 7,973,413,042 
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In [15]: # plotting world population trend since 1970 


plt.subplots(figsize-(10,5)) 

trend = df.iloc[:,5:13].sum() [::-1] 
sns.lineplot(x-trend.index, y-trend.values, marker="0") 
plt.xticks(rotation-45) 

plt.ylabel("Population") 

plt.title("World Population Trend (1970-2022)") 
plt.show() 


1e9 World Population Trend (1970-2022) 
8 ge 
7 
S 
56 
2 
2 
a 
£ 
5 
4 


In [16]: 
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# plotting current world population on map 


import pycountry 

countries = {} 

for country in pycountry.countries: 
countries[country.name] = country.alpha_3 


# get average of a list 
def Average(list): 
return sum(list) / len(list) 


df rc2022 = df.loc[:,["CCA3", "Country","2022"]1] 
df rc2022["CCA3"] = [countries.get(x, 'Unknown code') for x in df r 


fig = px.choropleth(df rc2022, locations="CCA3", 
hover. namez' Country", 
hover data-df rc2022.columns, 
Ccolors"2022", 
color continuous scale-'Viridis", 
range color-(min(df rc2022["2022"]), max(df. rc2 
projection="natural earth" 


) 


fig.update layout(margin-zi"r":5,"t":0,"1":5,"p":0]) 
fig.show() 
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In [17]: # let's see pie chart distribution for continent_df 


continent df['2022'].plot(kind = 'pie', figsize-(10,5), shadow=True 
plt.title(' Population Distribution by Continent') 
plt.axis('equal') 

plt.show() 


Population Distribution by Continent 


Africa 


Asia 


2022 


South America 


Oceania 


North America 


Europe 
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In [18]: # let's see world population by continent 


fig = px.bar(data_frame= df.groupby('Continent' , as_index= False). 


fig.update_layout(title= 'Current (2022) World Population per Conti 
fig. show() 
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In [19]: # let's see number of countries per continent 


df country-df['Continent'].value counts() 
fig-px.bar(x-df, country.index, 
y-df country.values, 
color-df country.index, 
color discrete sequence-px.colors.sequential.YlOrRd, 
text-df country.values, 
title- 'Number of Countries By Continent') 


fig.update layout(xaxis title-'"Countries", 
yaxis_title="Count") 


fig. show() 


Population Growth Rate 
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In [20]: # average population growth rate 


df['Growth Rate'].mean() 
Out[20]: 1.0095773504273504 


Since 1972 (50 years ago), the world population growth rate declined from around 296 
per year to under 1.096 per year. 
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In [21]: # plotting population Growth Rate on map 


fig = px.choropleth(df, 
locations='Country', 
locationmode='country names', 
color='Growth Rate', 
color continuous scale-'Viridis', 
template-'plotly', 
title = 'Growth Rate') 


fig.update layout(font = dict(size = 17, family="Gothic") ) 
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In [22]: # creating dataframe for top 10 countries with highest growth rate 
gwr top10 = df.sort_values(by='Country').sort_values(by='Growth Rat 


gwr_top10.head(3) 


Qut [22]; 
Rank CCA3 Country Capital Continent 2022 2020 2015 2010 
133 135 MDA Moldova Chisinau Europe 3272996 3084847 3277388 3678186 


164 37 POL Poland Warsaw Europe 39857145 38428366 38553146 38597353 


148 54 NER Niger Niamey Africa 26207977 24333639 20128124 16647543 


In [23]: # plotting top 10 highest growth rate countries in the last 30 year 


fig, ax = plt.subplots(figsize-(16,8)) 
plt.plot(gwr_top10['Country'], dwr top10['2020'], label='2020', mar 
plt.plot(gwr_top1@['Country'], dwr top10['1990'], label='1990', mar 


plt.xlabel('Country') 

plt.ylabel('Growth Rate') 

plt.grid(linewidth-0.3) 

plt.title('Top 10 Countries with Highest Growth Rate in the last 30 
plt.legend() 

plt.show() 


le7 Top 10 Countries with Highest Growth Rate in the last 30 years 


—e- 2020 
—* 1990 


Growth Rate 


T T T T T T T T T T 
Moldova Poland Niger Syria Slovakia DR Congo Mayotte Chad Angola Mali 
Country 


Population Decade-By-Decade Percent Change 
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In [24]: # creating dataframe for population difference decade-by-decade per 
pop diff = df.groupby('Continent')[['1970','1980', '1990', '2000', 


pop. diff.head(3) 


Out [24] : Continent 1970 1980 1990 2000 2010 2020 
0 Africa 365444348 — 481536377 638150629 818946032 1055228072 1360671810 
1 Asia 2144906290 2635334228 3210563577 3735089604 4220041327 4663086535 


2 Europe | 655923991 692527159 720320797 726093423 735613934 745792196 


In [25]: £ finding the population decade-by-decade percent change 


pop. diff['70s'] 
pop. diff['80s'] 
pop. diff['90s'] 
pop. diff['00s'] 
pop. diff['10s'] 


pop. diff['1970']/pop diff['1980']:100 
pop. diff['1980']/pop diff['1990']:100 
pop. diff['1990']/pop diff['2000']:100 
pop. diff['2000']/pop diff['2010']:100 
pop. diff['2010']/pop diff['2020']:100 


pop. diff.head(3) 


Out [25]: Continent 1970 1980 1990 2000 2010 2020 
0 Africa 365444348 481536377 638150629 818946032 1055228072 1360671810 
1 Asia 2144906290 2635334228 3210563577 3735089604 4220041327 4663086535 


2 Europe | 655923991 692527159 720320797 726093423 735613934 745792196 


In [26]: £ creating dataframe for decade-by-decade 
decade diff = pop diff.groupby('Continent')[['70s','80s', '90s', '0 


decade diff 


Out [26] : Continent 70s 80s 90s 00s 10s 
0 Africa 75.891327 75.458106 77.923404 77.608439 77.551990 
1 Asia 81.390295 82.083228 85.956802 88.508366 90.498885 


2 Europe 94.714551 96.141492 99.204975 98.705773 98.635242 
3 North America 85.647649 87.425282 86.667926 89.561653 91.330736 
4 Oceania 84.991562 85.702934 85.654845 84.152162 84.452244 


5 South America 79.799805 81.370326 84.987780 88.947756 91.089429 
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In [27]: # let's see decade_diff statistical summary quickly 


decade diff.describe() 


Dut [27]: 


count 
mean 
std 
min 
25% 
50% 
75% 


max 


70s 
6.000000 
83.739198 
6.454366 
75.891327 
80.197428 
83.190928 
85.483627 


94.714551 


80s 
6.000000 
84.696895 
6.966887 
75.458106 
81.548551 
83.893081 
86.994695 


96.141492 


90s 
6.000000 
86.732622 
6.898899 
71.923404 
85.154546 
85.805823 
86.490145 


99.204975 


00s 
6.000000 
87.914025 
6.941975 
71.608439 
85.241213 
88.728061 
89.408178 


98.705773 


10s 
6.000000 
88.926421 
7.163646 
77.551990 
85.963904 
90.794157 
91.270409 


98.635242 


In [28]: £ plotting wolrd population difference decade-by-decade percent cha 


fig, ax 


plt.subplots(figsize-(16,8)) 


plt.plot(decade diff['Continent'], decade diff['70s'], label='70s', 
plt.plot(decade diff['Continent'], decade diff['80s'], label='80s', 


plt.plot(decade diff['Continent'], decade diff['00s'], label-'00s', 
plt.plot(decade diff['Continent'], decade diff['10s'], label='10s', 
plt.grid(linewidth=0. 4) 
plt.title("World Population Difference Decade-By-Decade Percent Cha 
plt.xlabel('Continents') 
plt.ylabel('Population') 
plt.legend() 
plt.show() 


( 
( 
plt.plot(decade diff['Continent'], decade diff['90s'], label='90s', 
( 
( 


World Population Difference Decade-By-Decade Percent Change 1970s to 2010s 
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Area 


In [29]: # plotting total area distribution by continents 
import plotly.graph_objects as go 
df_cont= df['Continent'].unique() 
tot_area_cont = [] 


for each in df_cont: 
df area = df[df.Continent == each] 
area = sum(df area["Area (km?)"]) 
tot area cont.append(area) 


tot area cont - pd.DataFrame(tot area cont) 
df area = pd.DataFrame(df cont, columns = ["continent"]) 
df area["total"] = tot area cont 


fig = go.Figure(data-[go.Pie(labels-df area.continent, values-df ar 
insidetextorientation-'radial' 
)]) 

fig. show() 
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In [30]: # plotting Area distribution on map by country 


fig = px.choropleth(df, 
locations='Country', 
locationmode='country names', 
color='Area (km?)', 
color continuous scale-'Viridis', 
template-'plotly', 
title = 'Area (km?) ') 


fig.update layout(font = dict(size = 17, family="Gothic") ) 


Top 10 Countries With Most Population 
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Firstly, let's copy our dataframe 'df' to 'df_copy' so we can place 'Country' as the index 
to avoid affecting other analysis negatively. 


In [31]: # copying dataframe 'df' to 'df copy' 


df copy = df.copy() 
df copy.head(3) 


Qut[31]: 
Rank CCA3 Country Capital Continent 2022 2020 2015 2010 
0 36 AFG Afghanistan Kabul Asia 41128771 38972230 33753499 28189672 
1 138 ALB Albania Tirana Europe 2842321 2866849 2882481 2913399 
2 34 DZA Algeria Algiers Africa 44903225 43451666 39543154 35856344 


In [32]: df_copy.set_index('Country', inplace=True) 


In [33]: df_copy.sort_values(by='2022', ascending=True, inplace=True) 


df_top10 = df_copy['2022'].tail(10) 


df top10 

0ut[33]: Country 
Mexico 127504125 
Russia 144713314 
Bangladesh 171186372 
Brazil 215313498 
Nigeria 218541212 
Pakistan 2358248062 
Indonesia 275501339 
United States 338289857 
India 1417173173 
China 1425887337 


Name: 2022, dtype: int64 
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In [34]: # plotting top 10 MOST populated countries 
df_top1@.plot(kind='barh', figsize=(10, 10), color='darkblue' ) 
plt.xlabel('Population') 
plt.title('Top 10 Countries With MOST Population 2022') 


plt.show() 
Top 10 Countries With MOST Population 2022 
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In [35]: # plotting top 10 population trend 


inplace = True 
df_copy.sort_values(by='2022', ascending=False, axis-0, inplace=Tru 


df top 10 


df copy.head(10) 


df top 10 


df top 10[years].transpose() 


df top 10.index = df top 10.index.map(int) 
df top 10.plot(kind-'line', figsize=(14, 8)) 


plt.title('Trend of Top 10 MOST Populated Countries') 
plt.ylabel('Population') 

plt.xlabel('Years') 

plt.show() 
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In [36]: df_copy.sort_values(by='2022', ascending-True, inplace=True) 


df btm10 = df_copy['2022'].head(10) 


df btm10 

Out[36]: Country 
Vatican City 510 
Tokelau 1871 
Niue 1934 
Falkland Islands 3780 
Montserrat 4390 
Saint Pierre and Miquelon 5862 
Saint Barthelemy 10967 
Tuvalu 11312 
Wallis and Futuna 11572 
Nauru 12668 


Name: 2022, dtype: int64 


In [37]: df_btm10.plot(kind='barh', figsize=(10, 10), color='darkred') 
plt.xlabel('Population') 
plt.title('Top 10 Countries With LEAST Population 2022') 


plt.show() 
Top 10 Countries With LEAST Population 2022 
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In [38]: # top 10 countries with least population trend. 


inplace = True 
df_copy.sort_values(by='2022', ascending=False, axis-0, inplace=Tru 


df bttm10 


df bttm10 


df bttm10. 
df bttm10. 


df copy.tail(10) 


df bttm10[years].transpose() 


index = df bttm10.index.map(int) 
plot(kind='line', figsize=(14, 8)) 


plt.title('Trend of Top 10 Countries with LEAST population') 
plt.ylabel('Populaton') 

plt.xlabel('Years') 

plt.show() 
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Top 5 Most Populated Countries By Continents 


Here, we don't need to copy our dataframe or use the copied dataframe. We will use our 
original dataframe 'df' in this section 
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In [39]: # creating dataframes for countries per continent 


# Asia 
asian countries = df.loc[df["Continent"]=="Asia"].sort_values(by=[" 


# Africa 
african countries = df.loc[df["Continent"]=="Africa"].sort_values(b 


# Europe 
european countries = df. loc[df["Continent"]=="Europe"] .sort_values ( 


# North America 
na countries = df. loc[df["Continent"]=="North America"].sort values 


# Oceania 
oc countries = df.loc[df ["Continent"]=="Oceania"].sort_values(by=[" 


# South America 
sa countries = df.loc[df["Continent"]=="South America"].sort values 
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In [40]: 


Out [40]: 


# plotting top 5 MOST populated countries by continent 


# Asian countries 
asian countries[["Country", "2022"]].sort values(byz"2022", ascendi 


# African countries 
african countries[["Country", "2022"]].sort values(byz"2022", ascen 


# European countries 
european countries[["Country", "2022"]].sort_values(by="'2022", asce 


# North American countries 
na_countries[["Country", "2022"]].sort_values(by="2022", ascending= 


# Oceanian countries 
oc countries[["Country", "2022"]].sort values(byz"2022", ascending= 


# South American countries 
sa countries[["Country", "2022"]].sort values(byz"2022", ascending- 


«AxesSubplot:title-i'center':'South America Top 5 MOST Populated C 
ountries'}, xlabel='Country', ylabel='Population '> 
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In [41]: 


Out [41]: 


lotting top 5 LEAST populated countries by continent 


sian countries 
an countries[["Country", "2022"]].sort values(by-"2022", ascending=F 


frican countries 
ican countries[["Country", "2022"]].sort_values(by="2022", ascending 


uropean countries 
opean countries[["Country", "2022"]].sort values(byz"2022", ascendin 


orth American countries 
countries[["Country", "2022"]].sort values(byz"2022", ascending=Fals 


ceanian countries 
countries[["Country", "2022"]].sort values(byz"2022", ascending=Fals 


outh American countries 
countries[["Country", "2022"]].sort values(by-"2022", ascending=Fals 
«AxesSubplot:title-i'center':'South America Top 5 LEAST Populated 


Countries'}, xlabel='Country', ylabel='Population'> 
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In [42]: # current world population 


Out [42]: 


In [43]: 


Out [43]: 


df['2022'].sum() 


7973413042 


World current population: 7.9 billion 


# relationship betewen years and total population, we will convert | 


df tot 


df tot. 
dT 
dT 


df_ 


0 


Aà OO N 


tot. 


tot. 


tot. 


year 


1970 


1980 


1990 


2000 


2010 


= pd.DataFrame(df[years].sum(axis=0)) # use the sum() method 
index = map(int, df_tot.index) # change the years to type in 
reset_index(inplace = True) # reset the index to put in back 
columns = ['year', 'total'] # rename columns 

head() 


total 
3694136661 
4442400371 
5314191665 
6147055703 


6983784998 
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In [44]: # plotting a scatter plot for year vs total population 
df_tot.plot(kind='scatter', x='year', y='total', figsize=(10, 6), c 
plt.title('Current Wolrd Population' ) 
plt.xlabel('Year') 
plt.ylabel('Population') 


plt.show() 
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In [45]: £ fitting our data 


x = df tot['year'] 

y = df tot['total'] 

fit = np.polyfit(x, y, deg=1) 
fit 


Out[45]: array([ 8.33710451e+07, -1.60587660e+11] ) 
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In [46]: # plotting the regression line on the scatter plot 


Out [46]: 
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df_tot.plot(kind='scatter', x='year', y='total', figsize=(10, 6), c 
plt.title('World Population 1970 - 2022') 

plt.xlabel('Year') 

plt.ylabel('Population') 

# plot line of best fit 


plt.plot(x, fit[0] * x + fit[1], color='red') # recall that x is th 
plt.annotate('y210:.0f) x + {1:.0f}'.format(fit[0], fit[1]), xy=(20 


plt.show() 


# print out the line of best fit 
'World Population = 10:.0f) x Year + {1:.0f}'.format(fit[0], fit[1] 
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In [47]: # current asia population 


asian countries['2022'].sum() 


Out[47]: 4721383274 


Asia current population: 4.7 billion 


In [48]: df a230 = df[(df['Continent'] == 'Asia')] 


12/03/23, 10:00 PM 


# relationship betewen years and total asia population, we will con 


df asia tot = pd.DataFrame(df a230[years].sum(axis-0)) 


df asia tot.index - map(int, df asia tot.index) 


df asia tot.reset index(inplace - True) 


df asia tot.columns = ['year', 'total'] 


df asia tot.tail() 


Out [48]: 


3 


4 


year 


2000 


2010 


2015 


2020 


2022 


total 


3735089604 


4220041327 


4458250182 


4663086535 


4721383274 
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In [49]: 


In [50]: 


Out [50] : 
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df asia tot.plot(kind-2'scatter', x='year', y='total', figsize-(10, 


plt.title('Asian Population') 
plt.xlabel('Year') 
plt.ylabel('Population') 


plt.show() 
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# fitting asian data 


x as = df asia tot['year'] 

y .as = df asia tot['total'] 

fit as - np.polyfit(x as, y as, deg-1) 
fit as 


array([ 5.03219504e+07, —9.69643506e+10] ) 
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In [51]: 


Out [51]: 


In [52]: 


Gut [52]: 


# plotting Asian Populaion regression line on the scatter plot 
df_asia_tot.plot(kind='scatter', x='year', y='total', figsize=(10, 
plt.title('Asian Population 1970 - 2022') 

plt.xlabel('Year') 

plt.ylabel('Population') 

# plot line of best fit 

plt.plot(x_as, fit_as[@] * x_as + fit_as[1], color='red') # recall 
plt.annotate('y2(0:.0f) x + {1:.0f}'.format(fit_as[0], fit as[11), 
plt.show() 


# print out the line of best fit 
‘Asian Population = 10:.0f) x Year + {1:.0f}'.format(fit_as[0], fit 
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‘Asian Population = 50321950 x Year + -96964350561' 


Africa 2030 Population Projection 


# current africa population 


african countries['2022'].sum() 


1426730932 


Africa current population: 1.4 billion 
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In [53]: df_af230 = df[(df['Continent'] == 'Africa')] 
# relationship betewen years and total asia population, we will con 
df afri tot = pd.DataFrame(df, af230[years].sum(axis-0)) 
df afri tot.index - map(int, df afri tot.index) 
df afri tot.reset index(inplace - True) 
df afri tot.columns = ['year', 'total'] 
df afri tot.tail() 


Dut [53]: 


year total 
3 2000 818946032 
4 2010 1055228072 
5 2015 1201102442 
6 2020 1360671810 


7 2022 1426730932 
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In [54]: df_afri_tot.plot(kind='scatter', x='year', y='total', figsize=(10, 
plt.title('African Population') 
plt.xlabel('Year') 
plt.ylabel('Population') 


plt.show() 
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In [55]: £ fitting african data 


x af = df afri tot['year'] 

y af = df afri tot['total'] 

fit af = np.polyfit(x af, y af, deg-1) 
fit af 


Out[55]: array([ 2.06561727e+07, -4.04119432e410]) 
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In [55b]: 


Out [56]: 


In [57]: 


Qut[57]: 


# plotting African Populaion regression line on the scatter plot 
df afri tot.plot(kind2'scatter', x='year', y='total', figsize-(10, 
plt.title('African Population 1970 - 2022') 

plt.xlabel('Year') 

plt.ylabel('Population') 

# plot line of best fit 

plt.plot(x af, fit af[0] x x af + fit af[1], color='red') # recall 
plt.annotate('y={@:.@f} x + {1:.0f}'.format(fit_af[0], fit af[1]), 
plt.show() 


# print out the line of best fit 
'African Population = (0:.0f) + Year + (1:.0f?)'.format(fit af[0], f 


1e9 African Population 1970 - 2022 
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‘African Population = 20656173 * Year + -40411943187' 


Europe 2030 Population Projection 


# current europe population 


european_countries['2022'].sum() 


743147538 


Europe current population: 743 million 
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In [58]: df_eu230 = df[(df['Continent'] == 'Europe')] 
# relationship betewen years and total asia population, we will con 
df eu tot = pd.DataFrame(df eu230[years].sum(axis-0)) 
df eu tot.index = map(int, df eu tot.index) 
df eu tot.reset index(inplace - True) 
df eu tot.columns = ['year', 'total'] 
df eu tot.tail() 


Out [58]: 


year total 
3 2000 726093423 
4 2010 735613934 
5 2015 741535608 
6 2020 745792196 


7 2022 743147538 
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In [59]: df_eu_tot.plot(kind='scatter', x='year', y='total', figsize=(10, 6) 
plt.title('European Population') 
plt.xlabel('Year') 
plt.ylabel('Population') 


plt.show() 
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In [60]: £ fitting european data 


x eu = df eu tot['year'] 

y eu = df eu tot['total'] 

fit eu = np.polyfit(x eu, y. eu, deg=1) 
fit eu 


Out[60]: array([ 1.52598839e+06, -2.33319268e409]) 
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In [61]: 


Out [61]: 


In [62]: 


Out [62]: 


# plotting European Populaion regression line on the scatter plot 
df_eu_tot.plot(kind='scatter', x='year', y='total', figsize=(10, 6) 
plt.title('European Population 1970 - 2022') 

plt.xlabel('Year') 

plt.ylabel('Population') 

# plot line of best fit 

plt.plot(x eu, fit eu[0] * x eu + fit eu[1], color='red') # recall 
plt.annotate('y2(0:.0f) x + {1:.0f}'.format(fit_eu[0], fit eu[11), 
plt.show() 


# print out the line of best fit 
‘European Population = (0:.0f) x Year + (1:.0f)'.format(fit eu[0], 
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‘European Population = 1525988 * Year + -2333192682' 


North America 2030 Population Projection 


# current north america population 


na countries['2022'].sum() 


600296136 


North America current population: 600 million 
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In [63]: df na230 = df[(df['Continent'] == 'North America')] 
# relationship betewen years and total asia population, we will con 
df na tot = pd.DataFrame(df na230[years].sum(axis-0)) 
df na tot.index = map(int, df na tot.index) 
df na tot.reset index(inplace - True) 
df na tot.columns = ['year', 'total'] 
df na tot.tail() 


Out [63]: 


year total 
3 2000 486069584 
4 2010 542720651 
5 2015 570383850 
6 2020 594236593 


7 2022 600296136 
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In [64]: df_na_tot.plot(kind='scatter', x='year', y='total', figsize=(10, 6) 
plt.title('North American Population’) 
plt.xlabel('Year') 
plt.ylabel('Population') 


plt.show() 
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In [65]: # fitting north american data 


x na = df na tot['year'] 

y na = df na tot['total'] 

fit na = np.polyfit(x na, y. na, deg=1) 
fit na 


Out[65]: array([ 5.61735693e+06, -1.07522914e+10] ) 
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In [66]: 


Out [66]: 


In [67]: 


Out [67]: 


# plotting North American Populaion regression line on the scatter , 
df_na_tot.plot(kind='scatter', x='year', y='total', figsize=(10, 6) 
plt.title('North American Population 1970 - 2022') 
plt.xlabel('Year') 

plt.ylabel('Population') 

# plot line of best fit 

plt.plot(x na, fit na[0] * x na + fit na[1], color='red') # recall 
plt.annotate('y={@:.@f} x + {1:.0f}'.format(fit_na[0], fit na[11]), 
plt.show() 


# print out the line of best fit 
'North American Population = 10:.0f) x Year + {1:.O0f}'.format(fit_n 
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‘North American Population = 5617357 * Year + -10752291387' 


Oceania 2030 Population Projection 


# current oceania population 


oc_countries['2022'].sum() 


45038554 


Oceania current population: 45 million 
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In [68]: df. 0c230 = df[(df['Continent'] == 'Oceania')] 
# relationship betewen years and total asia population, we will con 
df oc tot = pd.DataFrame(df_oc230[years] . sum(axis=0) ) 
df_oc_tot.index = map(int, df_oc_tot. index) 
df_oc_tot.reset_index(inplace = True) 
df_oc_tot.columns = ['year', 'total'] 
df_oc_tot.tail() 


Out [68]: 


year total 
3 2000 31222778 
4 2010 37102764 
5 2015 40403283 
6 2020 43933426 


7 2022 45038554 
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In [69]: df_oc_tot.plot(kind='scatter', x='year', y='total', figsize=(10, 6) 
plt.title('Oceanian Population') 
plt.xlabel('Year') 
plt.ylabel('Population') 


plt.show() 
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In [70]: £ fitting oceania data 


x oc = df oc tot['year'] 

y. Oc = df oc tot['total'] 

fit oc = np.polyfit(x oc, y oc, deg-1) 
fit oc 


Out[70]: array([ 5.00543813e+05, -9.68169960e-08]) 
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In [71]: 


Out [71] : 


In [72]: 


Out [72] : 


# plotting Oceanian Populaion regression line on the scatter plot 
df oc tot.plot(kind2'scatter', x='year', y='total', figsize=(10, 6) 
plt.title('Oceanian Population 1970 - 2022') 

plt.xlabel('Year') 

plt.ylabel('Population') 

# plot line of best fit 

plt.plot(x oc, fit oc[0] * x oc + fit oc[1], color='red') # recall 
plt.annotate('y2(0:.0f) x + (1:.0f)'.format(fit oc[0], fit oc[11), 
plt.show() 


# print out the line of best fit 
‘Oceanian Population = (0:.0f) x Year + (1:.0f)'.format(fit oc[0], 
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'Oceanian Population = 500544 * Year + -9681699060' 


South America 2030 Population Projection 


# current south american population 


sa countries['2022'].sum() 


436816608 


South America current population: 436 million 
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In [73]: df_sa230 = df[(df['Continent'] == 'South America')] 


12/03/23, 10:00 PM 


# relationship betewen years and total asia population, we will con 


df sa tot = pd.DataFrame(df_sa230[years] .sum(axis=0) ) 


df_sa_tot. index 


map(int, df_sa_tot. index) 


df sa tot.reset index(inplace = True) 


df sa tot.columns = ['year', 'total'] 


df sa tot.tail() 


Qut[73]: 


year 
3 2000 
4 2010 
5 2015 
6 2020 


7 2022 
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393078250 
413134396 
431530043 
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In [74]: df_sa_tot.plot(kind='scatter', x='year', y='total', figsize=(10, 6) 
plt.title('South American Population’) 
plt.xlabel('Year') 
plt.ylabel('Population') 


plt.show() 
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In [75]: # fitting south america data 


x sa = df. sa tot['year'] 

y. sa = df. sa tot['total'] 

fit sa = np.polyfit(x sa, y. sa, deg=1) 
fit sa 


Out[75]1: array([ 4.74903293e+06, -9.15771175e+09] ) 
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In [76]: # plotting South American Populaion regression line on the scatter | 
df_sa_tot.plot(kind='scatter', x='year', y='total', figsize=(10, 6) 
plt.title('South American Population 1970 - 2022') 
plt.xlabel('Year') 
plt.ylabel('Population') 

# plot line of best fit 


plt.plot(x sa, fit sa[0] * x sa + fit sa[1], color='red') # recall 
plt.annotate('y2(0:.0f) x + (1:.0f)'.format(fit sa[0], fit sa[11), 


plt.show() 


# print out the line of best fit 
‘South American Population = 10:.0f) x Year + {1:.0f}'.format(fit_s 
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Out[76]: 'South American Population = 4749033 x Year + -9157711746' 


Canada 2026 Population Census Projection 


In [77]: df can = df[(df['Country'] == 'Canada')] 


df can['2022'].sum() 


0ut[77]: 38454327 
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Canada current population: 38 million 
Last population census: 2021 


Next population census: 2026 


In [78]: # relationship betewen years and total population, we will convert | 
df can tot = pd.DataFrame(df can[years].sum(axis-0)) 
df can tot.index = map(int, df can tot.index) 
df can tot.reset index(inplace - True) 
df can tot.columns = ['year', 'total'] 


df can tot.tail() 
Out [78]: 


year total 
3 2000 30683313 
4 2010 33963412 
5 2015 35732126 
6 2020 37888705 


7 2022 38454327 
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In [79]: df_can_tot.plot(kind='scatter', x='year', y='total', figsize=(10, 6 
plt.title('Canadian Population') 
plt.xlabel('Year') 
plt.ylabel('Population') 
plt.show() 
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In [80]: x can = df can tot['year'] 
y. can = df can tot['total'] 
fit can = np.polyfit(x can, y can, deg-1) 


fit can 


Out[80]: array([ 3.26758383e+05, -6.22512033e+08] ) 
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In [81]: # plot the Canadian Populaion regression line on the scatter plot 
df_can_tot.plot(kind='scatter', x='year', y='total', figsize=(10, 6 
plt.title('Canadian Population 1970 - 2022') 
plt.xlabel('Year') 
plt.ylabel('Population') 

# plot line of best fit 

plt.plot(x can, fit can[0] * x can + fit can[1], color='red') # rec 
plt.annotate('y={@:.@f} x + {1:.0f}'.format(fit_can[@], fit can[11) 
plt.show() 


# print out the line of best fit 
‘Canadian Population = (0:.0f) * Year + (1:.0f)'.format(fit can[0], 
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Out[81]: 'Canadian Population = 326758 x Year + -622512033' 


Ireland 2027 Population Census Projection 


In [82]: df ir = df[(df['Country'] == 'Ireland')] 


df ir['2022'].sum() 


Out[82]: 5023109 
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Ireland current population: 5 million 
Last population census: 2022 


Next population census: 2027 


In [83]: £ relationship betewen years and total population, we will convert 

df ir tot = pd.DataFrame(df ir[years].sum(axis-0)) 

df ir tot.index = map(int, df ir tot.index) 

df ir tot.reset index(inplace - True) 

df ir tot.columns = ['year', 'total'] 

df ir tot.tail() 
Out [83]: year total 
3 2000 3768950 
4 2010 4524585 
5 2015 4665760 
6 2020 4946119 


7 2022 5023109 
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In [84]: df_ir_tot.plot(kind='scatter', x='year', y='total', figsize=(10, 6) 
plt.title('Ireland Population') 
plt.xlabel('Year') 
plt.ylabel('Population') 


plt.show() 
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In [85]: x ir = df. ir tot['year'] 
y ir = df ir tot['total'] 
fit ir = np.polyfit(x ir, y ir, deg=1) 


fit ir 


Out[85]: array([ 4.04177619e+04, -7.67780243e407]) 
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In [86]: # plot the Ireland Populaion regression line on the scatter plot 
df_ir_tot.plot(kind='scatter', x='year', y='total', figsize=(10, 6) 
plt.title('Ireland Population 1970 - 2022') 
plt.xlabel('Year') 
plt.ylabel('Population') 

# plot line of best fit 
plt.plot(x ir, fit ir[0] * x ir + fit ir[1], color='red') # recall 
plt.annotate('yz(0:.0f) x + {1:.0f}'.format(fit_ir[0], fit ir[1]), 


plt.show() 


# print out the line of best fit 
'Ireland Population = 10:.0f) x Year + (1:.0f?'.format(fit ir[0], f 
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Out[86]: 'Ireland Population = 40418 x Year + -76778024' 


Australia 2026 Population Census Projection 


In [87]: df au = df[(df['Country'] == 'Australia')] 


df_au['2022'].sum() 
Out [87]: 26177413 
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Australia current population: 26 million 
Last population census: 2021 


Next population census: 2026 


In [88]: £ relationship betewen years and total population, we will convert | 
df au tot = pd.DataFrame(df au[years].sum(axis-0)) 
df au tot.index = map(int, df au tot.index) 
df au tot.reset index(inplace - True) 
df au tot.columns = ['year', 'total'] 


df au tot.tail() 
Out [88]: 


year total 
3 2000 19017963 
4 2010 22019168 
5 2015 23820236 
6 2020 25670051 


7 2022 26177413 
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In [89]: df_au_tot.plot(kind='scatter', x='year', y='total', figsize=(10, 6) 
plt.title('Australia Population') 
plt.xlabel('Year') 
plt.ylabel('Population') 


plt.show() 
le7 Australia Population 


26 
24 
22 


20 


Population 


18 
16 
14 


12 
1970 1980 1990 2000 2010 2020 


In [90]: x au = df_au_tot['year'] 
y. au = df au tot['total'] 
fit au = np.polyfit(x au, y au, deg=1) 


fit au 


Out[90]: array([ 2.62563289e405, -5.05224546e408]) 
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In [91]: # plot the Australia Populaion regression line on the scatter plot 
df_au_tot.plot(kind='scatter', x='year', y='total', figsize=(10, 6) 
plt.title('Australia Population 1970 - 2022') 
plt.xlabel('Year') 
plt.ylabel('Population') 

# plot line of best fit 

plt.plot(x au, fit au[0] * x au + fit au[1], color='red') # recall 
plt.annotate('y2(0:.0f) x + {1:.0f}'.format(fit_au[0], fit au[11), 
plt.show() 


# print out the line of best fit 
‘Australia Population = 1(0:.0f) + Year + (1:.0f)'.format(fit au[0], 
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Out[91]: 'Australia Population = 262563 * Year + -505224546' 


Comparing China and India Population 


http://localhost:8888/notebooks/Documents/WorldPouliton/world-population-eda-and-prediction-python.ipynb Page 59 of 64 


world-population-eda-and-prediction-python - Jupyter Notebook 


In [92]: 


# let's compare China and India population 


# creating dataframe for both countries 


df CI = df_copy.loc[['China', 


df_CI 
Out [92]: 


Country 


1970 1980 1990 
China 822534450 982372466 1153704252 
557501301 696828385 870452165 


India 


2000 


1264099069 


1059633675 


In [93]: £ let's transpose the dataframe 'df CI' 


df CI = df CI.transpose() 
df CI.head() 


Out[93]: country 


1970 
1980 
1990 
2000 


2010 


China 


822534450 


982372466 


1153704252 


1264099069 


1348191368 


India 


557501301 


696828385 


870452165 


1059633675 


1240613620 


'India'], years] 


2010 


1348191368 


1240613620 


In [94]: # checking out the statistical summary quickly 


df CI.describe() 


Out[94]: Country 


count 
mean 


std 


min 
25% 
50% 
75% 


max 


China 
8.000000e+00 
1.226929e+09 
2.240527e+08 
8.225344e+08 
1.110871e+09 
1.306145e+09 
1.401519e+09 


1.425887 e+09 


India 
8.000000e+00 
1.070182e+09 
3.299235e+08 
5.575013e+08 
8.270462e408 
1.150124e+09 
1.341247e+09 


1.417173e+09 
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2015 P 


1393715448 


1322866505 


142492€ 


139638; 
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In [95]: # let's perform a side by side comparison of the box plot with the 


fig = plt.figure() 
ax® = fig.add subplot(1, 2, 1) 
ax1 = fig.add subplot(1, 2, 2) 


# Subplot 1: Box plot 


df_CI.plot(kind='box', color='blue', vert-False, figsize-(20, 6), a 
ax0.set title('Box Plots of China and India Population (1970 - 2022 


ax0.set xlabel('Population') 
ax0.set ylabel('Countries') 


# Subplot 2: Line plot 
df CI.plot(kind-'line', figsize-(20, 6), ax=ax1) 


ax1.set title ('Line Plots of China and India population (1970 - 20 


ax1.set ylabel('Population') 
ax1.set xlabel('Years') 


plt.show() 
Box Plots of China and India Population (1970 - 2022) le9 Line Plots of China and India population (1970 - 2022) 
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In [96]: df_norm = df_copylyears].transpose() # transposed dataframe 
df norm, index = map(int, df norm.index) # cast the Years (the index 


# let's label the index. This will automatically be the column name 
df norm.index.name = 'Year' 


df norm.reset index(inplace-True) £ reset index to bring the Year i 


df norm.head(3) 


Out [96]: 


Country Year China India unnan Indonesia Pakistan Nigeria E 
States 


0 1970 822534450 557501301 200328340 115228394 59290872 55569264 963€ 
1 1980 982372466 696828385 223140018 148177096 80624057 72951439 1222€ 


2 1990 1153704252 870452165 248083732 182159874 115414069 95214257 1507C 


3 rows x 235 columns 


In [97]: # normalize China data 
norm china = (df norm['China'] - df norm['China'].min()) / (df norm 


# normalize India data 
norm india - (df norm['India'] - df norm['India'].min()) / (df norm 


In [98]: 
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# plotting the graph for normalized China - India 


import matplotlib.pyplot as plt 
*smatplotlib inline 

import matplotlib as mpl 
mpl.style.use(['ggplot']) 
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i China 
ax0 = df_norm.plot(kind='scatter', 
xs'Year', 
y='China', 
figsize=(14, 8), 
alpha=0.5, # transparency 
colors'green', 
s-norm china * 2000 + 10, # pass in weights 
xlim-(1970, 2022) 
) 
# India 
ax1 = df_norm.plot(kind='scatter', 
xs'Year', 
y='India', 
alpha=0.5, 


color="blue", 
s-norm india * 2000 + 10, 
ax-ax0 


) 


ax0.set ylabel('Population') 
ax0.set title('Top 2 Population 1970 to 2022') 


ax0.legend(['China', 'India'], loc='upper left', fontsize='x-large' 


Out[98]: <matplotlib. legend.Legend at @x7f5eb6893d90> 


1e9 Top 2 Population 1970 to 2022 


Population 


Year 
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In []: 
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